Part 1: Hypothesis Testing for Three Groups

1. Data Preprocessing

Data Import

raw_SKCM_data <- load('SKCM_data.rdata')
raw_SKCM_data
## [1] "raw_counts"   ".Random.seed" "samples"


Data Trimming

# Include only grouping variable of interest, cancer_stage & Mutation_count (for section 4)
# Remove other grouping variables.
SKCM_cancer_stage <- subset(samples, select = -c(Genome_altered_fraction,
                           Aneuploidy_score,
                           Buffa_hypoxia_score))
colnames(SKCM_cancer_stage)
## [1] "patient"           "sample"            "Diagnosis_age"    
## [4] "Sex"               "Cancer_stage"      "Mutation_count"   
## [7] "Nodal_involvement"
unique(SKCM_cancer_stage$Cancer_stage) 
##  [1] "STAGE IIIB"       NA                 "STAGE IB"         "STAGE IIC"       
##  [5] "STAGE IIB"        "STAGE II"         "STAGE I"          "STAGE I/II (NOS)"
##  [9] "STAGE IA"         "STAGE 0"          "STAGE IIA"        "STAGE IV"        
## [13] "STAGE IIIC"       "STAGE III"        "STAGE IIIA"


Data Cleaning

# Filter out N/A values, filter out 'other cancer' stages 
SKCM_cancer_stage <- SKCM_cancer_stage %>%
  filter(!is.na(Cancer_stage)) %>%
  filter(Cancer_stage != "STAGE I/II (NOS)") %>% #exclude this stage as informed by a3 matrix
  filter(Cancer_stage != "STAGE 0") %>% #exclude this stage as informed by a3 matrix
  filter(!is.na(Diagnosis_age)) %>% #Section 2 dependent var.
  filter(!is.na(Sex)) %>% #Section 3 dependent var.
  filter(!is.na(Mutation_count)) #Section 4 dependent var.

# check. The only result should be FALSEs, no TRUE for N/A and for excluded stages.
unique(is.na(SKCM_cancer_stage$Cancer_stage))
## [1] FALSE
unique(SKCM_cancer_stage$Cancer_stage == "STAGE I/II (NOS)")
## [1] FALSE
unique(SKCM_cancer_stage$Cancer_stage == "STAGE 0")
## [1] FALSE
unique(is.na(SKCM_cancer_stage$Sex))
## [1] FALSE
unique(is.na(SKCM_cancer_stage$Diagnosis_age))
## [1] FALSE


Cancer Stage Grouping

Here, the groupings are defined as:

  • Group 1: Stage I to Stage II (and their all sub-categories, if any)

  • Group 2: Stage III (and their all sub-categories, if any)

  • Group 3: Stage IV (and their all sub-categories, if any)

# Group cancer stage classifications into three groups. 
# Stage I-II (and sub-categories) -> Group 1
# Stage III (and sub-categories) -> Group 2
# Stage IV (and sub-cateogries) -> Group 3

SKCM_cancer_stage_grouped <- SKCM_cancer_stage %>%
  mutate(assignment_group = case_when(
          Cancer_stage %in% c("STAGE I",
                              "STAGE IA",
                              "STAGE IB",
                              "STAGE II",
                              "STAGE IIA",
                              "STAGE IIB", 
                              "STAGE IIC") ~ "Group 1",
         Cancer_stage %in% c("STAGE III",
                             "STAGE IIIA",
                             "STAGE IIIB",
                             "STAGE IIIC") ~ "Group 2",
         Cancer_stage == "STAGE IV" ~ "Group 3"))

SKCM_cancer_stage_grouped$assignment_group <- factor(SKCM_cancer_stage_grouped$assignment_group)

# Check if column has been converted to a factor.
is.factor(SKCM_cancer_stage_grouped$assignment_group) 
## [1] TRUE
unique(SKCM_cancer_stage_grouped$assignment_group)
## [1] Group 2 Group 1 Group 3
## Levels: Group 1 Group 2 Group 3


Number of samples per group

SKCM_cancer_stage_grouped %>%
  group_by(assignment_group) %>% 
  summarise('Total number of samples' = length(sample))
## # A tibble: 3 × 2
##   assignment_group `Total number of samples`
##   <fct>                                <int>
## 1 Group 1                                192
## 2 Group 2                                163
## 3 Group 3                                 21


2. Comparison of diagnosis age across three cancer stage groups

Data Visualisation

# a. Produce a plot (of your choice) to show the distribution of age at diagnosis for each group and report the group mean age;

SKCM_cancer_stage_grouped %>%
  # Y = Age
  # X = Group
  ggplot(aes(x = assignment_group, y = Diagnosis_age, fill = assignment_group)) +
  geom_violin(width = 0.7) +
  labs(x = "Cancer stage group", y = "Age (years)") +
  theme(axis.text.x = element_text(size = 10, 
                                   margin = margin(b=10)), # increase x-label spacing
        axis.ticks.x = element_blank(), # remove ticks of x axis
        axis.text.y = element_text(margin = margin(l=10)), # increase y-label spacing
        legend.position="none") +
  geom_boxplot(width = 0.06, fill="white")


Group mean age reported in table below:

SKCM_cancer_stage_grouped %>%
  group_by(assignment_group) %>%
  summarise('Mean diagnosis age' = mean(Diagnosis_age)) 
## # A tibble: 3 × 2
##   assignment_group `Mean diagnosis age`
##   <fct>                           <dbl>
## 1 Group 1                          58.7
## 2 Group 2                          58  
## 3 Group 3                          55.3


Statistical Testing

For this section, it is being tested if diagnosis age is different across the three different cancer stage groupings. To do this, a one-way ANOVA test will be performed. This test has 3 main assumptions: 1) Sample Independence, 2) Normality (or approximate normality), 3) Homoscedasticity.

Preliminary Tests: Normality

SKCM_cancer_stage_grouped %>%
  ggplot(aes(sample = Diagnosis_age)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(~assignment_group, scales='free')

Groups 1 and 2 appear normal (follow the straight line representing a perfectly normal distribution) until the 75th percentile. Group 3 seems to not follow the line, hence may not be normally distributed.

# use Shapiro-Wilk test to test for normality in each group distribution
SKCM_cancer_stage_grouped %>%
  filter(assignment_group == 'Group 1') %>%
  dplyr::pull(Diagnosis_age) %>%
  shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.97921, p-value = 0.005909
SKCM_cancer_stage_grouped %>%
  filter(assignment_group == 'Group 2') %>%
  dplyr::pull(Diagnosis_age) %>%
  shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.97994, p-value = 0.01821
SKCM_cancer_stage_grouped %>%
  filter(assignment_group == 'Group 3') %>%
  dplyr::pull(Diagnosis_age) %>%
  shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.90534, p-value = 0.04446

All Shapiro-Wilk tests for normality resulted in p-values < 0.05, suggesting that the distribution of all 3 groups do not follow a normal distribution. However, the W-statistic is quite close to 1 (where 1 is a perfect match to a normal distribution), implying that there is only a slight departure from normality in each group distribution. Therefore, a one-way ANOVA can still be used as the F-test is quite robust to moderate departures in normality.


Preliminary Tests: Homoscedasticity

Since Bartlett’s test is sensitive to departures from normality, Levene’s test (which is less sensitive to this) is used instead to test for homoscedasticity.

# Using Levene's Test to check for homoscedasticity
leveneTest(Diagnosis_age ~ assignment_group, data = SKCM_cancer_stage_grouped)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value  Pr(>F)  
## group   2  2.3418 0.09757 .
##       373                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This Levene test resulted in a p-value > 0.05, suggesting that there is evidence in favour of implying homoscedasticity between the cancer stage groups. Therefore, a one-way ANOVA test can be used to compare the mean diagnosis age across all three groups.


Null and Alternate Hypotheses:

Let the means of group 1, group 2 and group 3 be \(\mu_{1}\), \(\mu_{2}\), and \(\mu_{3}\) respectively. We can then state the following null hypothesis:

\[ H_{0}: (\mu_{1} = \mu_{2} = \mu_{3}) \]

Which states that there is no difference between group medians, and that they are all equal.

With this, we also state the following alternate hypothesis:

\[ H_{a}: \neg \space (\mu_{1} = \mu_{2} = \mu_{3}) \]

Which states that the group means are NOT all equal (negation of the equality denoted by \(\neg\)), and at least one group mean is different from the others.


One-way ANOVA Test

group_aov <- aov(Diagnosis_age ~ assignment_group, data = SKCM_cancer_stage_grouped)
summary(group_aov)
##                   Df Sum Sq Mean Sq F value Pr(>F)
## assignment_group   2    227   113.3   0.459  0.632
## Residuals        373  92037   246.8

This one-way ANOVA resulted in an F-statistic of 0.424 (with 2 degrees of freedom), which has an associated p-value of 0.654. With a defined significance level of 0.05, these results provide evidence for the null hypothesis as p > 0.05. This implies that there is no significant difference between group means.


Assumptions check for the one-way ANOVA test

Check assumptions:

Property/Assumption Evidence/justification for property violation/adherence
Independent Samples Each patient belongs to only one cancer stage grouping (each patient has only one type of cancer stage which belongs only to one group). \(\checkmark\)
Normality Slight departure from normality, as implied by Shapiro-Wilk tests on the distributions of each group. However, F-tests are robust against slight deviations from normality, therefore a one-way ANOVA could still be performed here. \(\times\)
Homoscedasticity (within-group) Using Levene’s test, the within-group variances are not significantly different. \(\checkmark\)


Conclusion

It can be concluded that there is no difference in mean diagnosis age between the three cancer stage groups.


3. Comparison of biological sex across three cancer stage groups

Statistical Testing

For this section, two categorical variables (sex and cancer stage grouping) are being tested to see if they have a relationship. To test this a chi-square test will be performed.


Contingency Table Construction

To perform a chi-square test, a contingency table comparing these two categorical variables needs to be constructed in the following/similar format:

Group 1 Group 2 Group 3
Female x x x
Male x x x
cont_sg <- xtabs(~Sex + assignment_group, data=SKCM_cancer_stage_grouped)
cont_sg
##         assignment_group
## Sex      Group 1 Group 2 Group 3
##   Female      69      64       8
##   Male       123      99      13

Therefore a contingency table of observed frequencies is as follows:

Group 1 Group 2 Group 3
Female 71 65 8
Male 123 100 3

.. and an associated contingency table of expected frequencies (in the case of a null relationship):

Group 1 Group 2 Group 3
Female 75.50 64.22 4.28
Male 118.50 100.78 6.72


Null and Alternate Hypotheses:

The following null hypothesis can then be stated:

\[ H_{0}: Cancer \space\ Stage \space Group \space\ and \space\ Sex \space\ are \space\ independent \]

Here, we expect the observed values to not be significantly different from the contingency table of expected frequencies (2nd table above).

The alternative hypothesis would be:

\[ H_{0}: Cancer \space\ Stage \space Group \space\ and \space\ Sex \space\ are \space\ not \space\ independent \]

Here, values would be may be too far departed from the expected frequency table, hence there would be evidence against the null hypothesis.


Chi-square Test

chisq.test(cont_sg)
## 
##  Pearson's Chi-squared test
## 
## data:  cont_sg
## X-squared = 0.41953, df = 2, p-value = 0.8108

This chi-square test resulted in an chi-squared statistic of 0.29656 (with 2 degrees of freedom), which has an associated p-value of 0.8622. With a defined significance level of 0.05, these results provide evidence for the null hypothesis as p > 0.05.


Assumptions check for the chi-square test

Check assumptions:

Property/Assumption Evidence/justification for property violation/adherence
Independent Samples

Each patient belongs to only one cancer stage grouping (each patient has only one type of cancer stage which belongs only to one group).

Likewise for sex, patients are only classified as male or female (not both).

\(\checkmark\)
Mutual exclusivity (one patient belongs to one cell in the contingency table) Patients are grouped into one type of cancer stage group, and one sex. Patients therefore belong into one stage-sex combination cell. \(\checkmark\)
Expected values > 5 in at least 80% of the cells. 5 out of 6 (83.33%) cells in the expected frequency/count contingency table (see above; contingency table construction) have a value greater than 5. \(\checkmark\)

Conclusion

It can be concluded that cancer stage grouping and biological sex are independent categorical variables.


4. Comparison of mutation count across three cancer stage groups

Data Visualisation

# a. Produce a plot (of your choice) to show the distribution of age at diagnosis for each group and report the group mean age;

SKCM_cancer_stage_grouped %>%
  # Y = Age
  # X = Group
  ggplot(aes(x = assignment_group, y = Mutation_count, fill = assignment_group)) +
  geom_violin(width = 0.7) +
  labs(x = "Cancer stage group", y = "Mutation count") +
  theme(axis.text.x = element_text(size = 10, 
                                   margin = margin(b=10)), # increase x-label spacing
        axis.ticks.x = element_blank(), # remove ticks of x axis
        axis.text.y = element_text(margin = margin(l=10)), # increase y-label spacing
        plot.title = element_text(size = 20,  # larger font size
                                  face = "bold", # bold text
                                  hjust = 0.5), # centred
        legend.position="none") +
  geom_boxplot(width = 0.06, fill="white")

These plots imply that the data are very skewed towards higher mutation counts and do not follow a normal distribution. However, performing a natural log (ln) transformation on Mutation_counts results in the following distributions, shown below:

# a. Produce a plot (of your choice) to show the distribution of age at diagnosis for each group and report the group mean age;

SKCM_cancer_stage_grouped %>%
  ggplot(aes(x = assignment_group, y = log(Mutation_count), fill = assignment_group)) +
  geom_violin(width = 0.7) +
  labs(x = "Cancer stage group", y = "ln(Mutation count)") +
  theme(axis.text.x = element_text(size = 10, 
                                   margin = margin(b=10)), # increase x-label spacing
        axis.ticks.x = element_blank(), # remove ticks of x axis
        axis.text.y = element_text(margin = margin(l=10)), # increase y-label spacing
        plot.title = element_text(size = 20,  # larger font size
                                  face = "bold", # bold text
                                  hjust = 0.5), # centred
        legend.position="none") +
  geom_boxplot(width = 0.06, fill="white")

These distributions now seem to follow a normal distribution.


Statistical Testing

For this section, it is being tested if mutation count is different across the three different cancer stage groupings.

Preliminary Tests: Normality

SKCM_cancer_stage_grouped %>%
  ggplot(aes(sample = log(Mutation_count))) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(~assignment_group, scales='free')

Most of the data does seem to follow the trend-line which represents a perfect normal distribution, although more of the distribution of group 3 seems to deviate more from the line, but may be due to its more limited sample size. This deviation may or may not be significant, hence to further test for normality, a Shapiro-Wilk statistical test will be also be performed.

# use Shapiro-Wilk test to test for normality in each group distribution

# For each group, 
SKCM_cancer_stage_grouped %>%
  filter(assignment_group == 'Group 1') %>% #Subset by group
  mutate(log_mut_count = log(Mutation_count)) %>% #add a column with ln(mutation_count)
  dplyr::pull(log_mut_count) %>% #get only this column
  shapiro.test() #perform normality test
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.98282, p-value = 0.0188
SKCM_cancer_stage_grouped %>%
  filter(assignment_group == 'Group 2') %>%
  mutate(log_mut_count = log(Mutation_count)) %>%
  dplyr::pull(log_mut_count) %>%
  shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.99023, p-value = 0.325
SKCM_cancer_stage_grouped %>%
  filter(assignment_group == 'Group 3') %>%
  mutate(log_mut_count = log(Mutation_count)) %>%
  dplyr::pull(log_mut_count) %>%
  shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.90995, p-value = 0.0548

The Shapiro-Wilk normality tests provide evidence for normality in only group 2 but not for group 1 and 3 (p < 0.05). However, like before, these are only slight departures for normality with W scores being close to 1 (around 0.98, and 0.90 for group 1 and 3 respectively). Therefore, a one-way ANOVA can still be performed to compare the log of mutation counts across the three cancer stage groups, as it it robust against slight departures from normality.


Preliminary Tests: Homoscedasticity

# Using Levene's Test to check for homoscedasticity
leveneTest(log(Mutation_count) ~ assignment_group, data = SKCM_cancer_stage_grouped)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value  Pr(>F)  
## group   2  4.4333 0.01251 *
##       373                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This Levene test resulted in a p-value < 0.05, providing evidence against the null hypothesis of homoscedasticity. This implies that the data is heteroscedastic. Heteroscedasticity in this case may be caused by unequal sample sizes, where group 3 has very limited samples (21 samples) compared to group 1 (194 samples) and group 2 (165 samples). Therefore, to compare the groups, a Welch’s ANOVA test will be used instead of a one-way ANOVA.


Null and Alternate Hypotheses:

Let the means of group 1, group 2 and group 3 be \(\mu_{1}\), \(\mu_{2}\), and \(\mu_{3}\) respectively. We can then state the following null hypothesis:

\[ H_{0}: (\mu_{1} = \mu_{2} = \mu_{3}) \]

Which states that there is no difference between group medians, and that they are all equal.

With this, we also state the following alternate hypothesis:

\[ H_{a}: \neg \space (\mu_{1} = \mu_{2} = \mu_{3}) \]

Which states that the group means are NOT all equal (negation of the equality denoted by \(\neg\)), and at least one group mean is different from the others.


Welch’s ANOVA Test

# Set var.equal = FALSE to perform a Welch's ANOVA test
group_aov_welch <- aov(log(Mutation_count) ~ assignment_group, 
                 data = SKCM_cancer_stage_grouped,
                 var.equal = FALSE)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'var.equal' will be disregarded
summary(group_aov_welch)
##                   Df Sum Sq Mean Sq F value  Pr(>F)   
## assignment_group   2   15.1   7.575   5.271 0.00553 **
## Residuals        373  536.0   1.437                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This Welch’s ANOVA test resulted in an F-statistic of 5.271 (with 2 degrees of freedom), which has an associated p-value of 0.00553. With a defined significance level of 0.05, these results provide evidence against the null hypothesis, in favour of the alternate hypothesis as p < 0.05. This implies that there is as significant difference between group means.


Assumptions check for Welch’s ANOVA test

Check assumptions:

Property/Assumption Evidence/justification for property violation/adherence
Independent Samples Each patient belongs to only one mutation count grouping. \(\checkmark\)
Normality Shapiro-Wilk tests imply normality only for group 2. ANOVAs are robust against slight departures from normality. Moreover, the data has unequal variances among the groups, which makes a Welch’s ANOVA more appropriate to use as it does not assume equal variances. \(\times\)


Post-hoc tests: Tukey-Kramer Procedure

Given that the data is heteroscedastic (likely due to an unbalanced design), the Tukey-Kramer post-hoc procedure will be used to perform pair-wise comparisons between group means.

TukeyHSD(group_aov_welch)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = log(Mutation_count) ~ assignment_group, data = SKCM_cancer_stage_grouped, var.equal = FALSE)
## 
## $assignment_group
##                       diff        lwr         upr     p adj
## Group 2-Group 1 -0.3163384 -0.6167622 -0.01591472 0.0363285
## Group 3-Group 1 -0.7101723 -1.3584992 -0.06184548 0.0278221
## Group 3-Group 2 -0.3938339 -1.0478221  0.26015435 0.3331170

These results suggest that there is a difference when comparing group 1 with group 2 and group 1 with group 3 (p-adjusted for multiple comparisons < 0.05). The post-hoc tests suggest that the group 1 mean is approximately \(e^{0.31}\) (1.36) counts larger than the group 2 mean, and \(e^{0.71}\) (2.03) counts larger than the group 3 mean.


Conclusion

It can be concluded that the mean mutation count in group 1 is larger than the mean mutation count in group 2 by 0.31 counts, and in group 3 by 0.71 counts. Furthermore, this implies that patients earlier stage cancer (representative of group 1) may have greater mutation counts than later stage cancer patients (representative of group 2 and 3).


Part 2: Differential Gene Expression Analysis

1. edgeR Object Creation, Filtering and Normalisation

Object Creation with DGEList

# Vector of samples to include in DGElist. 
# rownames corresponds to cols in raw_counts (full patient id)
# i.e. TCGA-D3-A51F-06A-11R-A266-07
patients_include <- rownames(SKCM_cancer_stage_grouped)

# Get only the gene counts for the included patients 
raw_counts_include <- raw_counts[,patients_include]

# Construct the edgeR object. Groups specified by assignment_group
SKCM_edgeR <- DGEList(counts=raw_counts_include,  
                        samples=SKCM_cancer_stage_grouped, ###################HANGE?skcm$sample
                        group=SKCM_cancer_stage_grouped$assignment_group)

#check that our cancer stage grouping factor is there
head(SKCM_edgeR$samples$group)
## [1] Group 2 Group 1 Group 2 Group 1 Group 1 Group 1
## Levels: Group 1 Group 2 Group 3


Filtering: Exclude lowly expressed genes (min.count = 10)

# Using the default value for min.count = 10 
kept_genes <- filterByExpr(SKCM_edgeR) 

SKCM_edgeR_filtered <- SKCM_edgeR[kept_genes, keep.lib.sizes=FALSE]

dim(SKCM_edgeR$counts)
## [1] 19550   376
dim(SKCM_edgeR_filtered$counts)
## [1] 17134   376


Number of genes before and after filtering

Pre-filtering: No. of genes = 19550

Post-filtering: No. of genes = 17144

2046 genes filtered out


Normalisation

SKCM_edgeR_filtered_norm <- calcNormFactors(
                      SKCM_edgeR_filtered, #data of interest
                      method = c("TMM","TMMwsp","RLE","upperquartile","none"), #scaling method
                      refColumn = NULL, # reference column for chosen method
                      logratioTrim = .3, #proportion of observations to trim from tail. M-val 
                      sumTrim = 0.05,  #proportion of observations to trim from tail. A-vals
                      doWeighting = TRUE, #used by TMM. use weights when computing mean M vals
                      Acutoff = -1e10, #minimum cutoff for A-vals
                      p = 0.75 #quantile of counts used by method = upperquartile
                )


2. Transformation after filtering and normalisation

Transformation

# Log2 CPM transform the normalised values
SKCM_filtered_norm_lcpm <- SKCM_edgeR_filtered_norm %>%
  cpm(log=TRUE, prior.count=1)

Visualisation of first 50 patients

SKCM_filtered_norm_lcpm[,1:50] %>% 
  as.data.frame() %>% 
  rownames_to_column(var="gene") %>% 
  gather(key="Sample", value = "logCPM",-gene) %>% 
  ggplot(aes(x = `logCPM`, colour=Sample)) +
  geom_density() + 
  theme(legend.position = 'none') +
  xlim(-10,15)
## Warning: Removed 4 rows containing non-finite values (stat_density).

SKCM_filtered_norm_lcpm[,1:50] %>%
  as.data.frame() %>%
  rownames_to_column(var="gene") %>%
  gather(key="Sample", value = "logCPM",-gene) %>%
  ggplot(aes(y = `logCPM`, colour=Sample)) +
  geom_boxplot() + 
  theme(legend.position = 'none', 
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()
        ) 


3. Construct Design Matrix, Estimate Dispersion, Fit Generalised Linear Model

Design Matrix Construction

# Compare 3 groups; ~group
# includes intercept term.
SKCM_design <- model.matrix(~group, SKCM_edgeR_filtered_norm$samples)


Estimate Dispersion

SKCM_edgeR_filtered_norm <- estimateDisp(SKCM_edgeR_filtered_norm, SKCM_design)

plotBCV(SKCM_edgeR_filtered_norm)


Generalised Linear Model Fitting

SKCM_glm_fit <- glmFit(SKCM_edgeR_filtered_norm, SKCM_design)
#log2(0.5) -> FC = 0.5, 2^0.5 ~= 1.4x 
# All coefficients are relative to first factor (which is group 1)
# Coef = 3, compares group 3 to group 1
SKCM_glm_treat <- glmTreat(SKCM_glm_fit, coef=3, lfc=0.5)
summary(decideTests(SKCM_glm_treat))
##        groupGroup 3
## Down             39
## NotSig        16975
## Up              120
plotMD(SKCM_glm_treat)
abline(h=c(-0.5, 0.5), col="blue")


4. Group 3 and Group 1 Gene Expression Comparison

There are 121 up-regulated and 40 down-regulated genes in group 3, compared to group 1.


5. Top 10 most differentially expressed genes

  • Sorted by p-value
topTags(SKCM_glm_treat, n=10, sort.by="PValue") # get the top 10 most significant diffexp genes
## Coefficient:  groupGroup 3 
##                             logFC unshrunk.logFC     logCPM       PValue
## NTRK1|ENSG00000198400    4.216943       4.220610  0.8709234 3.585405e-46
## GFAP|ENSG00000131095     4.997463       5.003804  0.4605904 1.965199e-29
## SMOC2|ENSG00000112562    3.074526       3.074789  3.9941394 2.251147e-24
## MN1|ENSG00000169184      3.101171       3.102036  2.2932319 8.065887e-24
## PLXDC1|ENSG00000161381   2.613640       2.613865  3.9712456 5.495807e-23
## SLC22A16|ENSG00000004809 3.322197       3.347435 -2.1973356 1.145805e-21
## ARRDC4|ENSG00000140450   2.548012       2.548087  5.4843581 6.294644e-20
## SLC16A12|ENSG00000152779 3.527093       3.539542 -1.4622004 7.506141e-20
## EBF2|ENSG00000221818     3.023655       3.024885  1.6311248 8.447542e-20
## NKX6-3|ENSG00000165066   5.536704       5.658302 -2.5329966 3.753254e-19
##                                   FDR
## NTRK1|ENSG00000198400    6.143234e-42
## GFAP|ENSG00000131095     1.683586e-25
## SMOC2|ENSG00000112562    1.285705e-20
## MN1|ENSG00000169184      3.455023e-20
## PLXDC1|ENSG00000161381   1.883303e-19
## SLC22A16|ENSG00000004809 3.272036e-18
## ARRDC4|ENSG00000140450   1.540749e-16
## SLC16A12|ENSG00000152779 1.607628e-16
## EBF2|ENSG00000221818     1.608224e-16
## NKX6-3|ENSG00000165066   6.430825e-16

The top 10 differentially expressed genes sorted by p-value are:

  • NTRK1

  • GFAP

  • SMOC2

  • MN1

  • PLXDC1

  • SLC22A16

  • EBF2

  • SLC16A12

  • ARRDC4

  • NKX6-3


Part 3: Principal Component Analysis (PCA)

1. Perform PCA on all genes

# Tranpose data, so rows represent subjects and columns represent genes.
trans_SKCM_filtered_norm_lcpm <- t(SKCM_filtered_norm_lcpm)

# Compute pca, with 10 components. Center and scale data.
SKCM_pca <- pca(ncomp=10,
                trans_SKCM_filtered_norm_lcpm,
                center=TRUE,
                scale=TRUE)


2. Plot the first two PCs, coloured by biological groups

# The corresponding Y labels can be found as a column from SCKM_cancer_stage_grouped
# SKCM_cancer_stage_grouped$assignment_group

plotIndiv(
  SKCM_pca, 
  comp = 1:2,# plot pc1 and pc2
  group = SKCM_cancer_stage_grouped$assignment_group, # Y-labels/Group labels 
  ind.names = FALSE, 
  ellipse = TRUE, 
  legend = TRUE, 
  title = 'PCA on SKCM_filtered_norm_lcpm'
)

The principal components (PC1 and PC2) explaining the most variance in the data does not seem to be related to cancer stage (biological) grouping, as there is no clear separation between the different coloured groups. This means that PC1 and PC2, may be related to other variables/groupings (with either biological or non-biological relevance). It seems to cluster based on the density of data points, rather than biological group.


3. Justification for 2 PCs being insufficient

pc_eigenvalues <- SKCM_pca$sdev^2

pc_eigenvalues <- data_frame(PC = factor(1:length(pc_eigenvalues)), 
                         variance = pc_eigenvalues) %>% 
  # add a new column with the percent variance
  mutate(pct = variance/sum(variance)*100) %>% 
  # add another column with the cumulative variance explained
  mutate(pct_cum = cumsum(pct))
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
pc_eigenvalues %>% 
  ggplot(aes(x = PC)) + #x-axis
  geom_col(aes(y = pct)) + #y-axis = variance explained by each pc
  geom_line(aes(y = pct_cum, group = 1)) + #the cumulative variance with more pcs added
  geom_point(aes(y = pct_cum)) + 
  labs(x = "Principal component", y = "Fraction variance explained")

Plotting a pareto chart (as shown above) indicates that the first two principal components only explain 50% of variance in the data. More variance can be explained with more principal components (as shown, where the adding more principal component increases cumulative variance). Moreover, the biological groupings of cancer stages (which is the research question) does not seem to be captured by these principal components, and may be captured by other principal components.


Part 4: Clustering and Classification

1. K-means clustering on samples, using all differentially expressed genes

#DGE_cluster_k2 <- kmeans(t(hclust_m), centers = 2, iter.max = 100, nstart = 25) 

# get the ids of group 1
# get the ids of group 3
# store into group_1_3_ids
group_1_3_ids <- SKCM_cancer_stage_grouped %>%
  filter(assignment_group == 'Group 1' | assignment_group == 'Group 3') %>%
  rownames()

# use the ids to subset SKCM_filtered_norm_lcpm columns
SKCM_DGE_1_3 <- SKCM_filtered_norm_lcpm[,group_1_3_ids]

#transpose so genes are columns. then scale the matrix columns (genes). 
# Ensure rows are samples, since clustering based on samples
SKCM_DGE_1_3_scaled <- SKCM_DGE_1_3 %>%
  t() %>%
  scale() # then scale data
  
# perform K-means clustering, with defined parameters in assignment. 
DGE_cluster_k2 <- kmeans(SKCM_DGE_1_3_scaled, centers=2, nstart=25, iter.max=100)

dge_clusters <- DGE_cluster_k2$cluster # cluster that gene belongs to
dge_centers <- DGE_cluster_k2$centers #Distance of cluster 1/2 to a sample
dge_pts <-DGE_cluster_k2$size # number of points per cluster
# do a PCA, but only on group 1 and 3. 
skcm_pca_1_3 <- prcomp(SKCM_DGE_1_3_scaled,  scale = TRUE)
# get the indices of each sample
skcm_coord <- as.data.frame(get_pca_ind(skcm_pca_1_3)$coord)
# add the cluster as a factor 
skcm_coord$cluster <- factor(DGE_cluster_k2$cluster)
# Add group 1 and 3 indices to subset the skcm_coord
skcm_coord$assignment_group <- SKCM_cancer_stage_grouped %>%
  filter(assignment_group != 'Group 2') %>%
  dplyr::pull(assignment_group)

# Extract eigenvalues to get percent variance explained by pc1 and pc2
SKCM_eigenvalue_1_3 <- round(get_eigenvalue(skcm_pca_1_3), 1)
SKCM_var_perc <- SKCM_eigenvalue_1_3$variance.percent

ggscatter(
  skcm_coord, x = "Dim.1", y = "Dim.2", #same as PC1 and PC2
  color = "cluster", #colour by cluster group
  palette = "npg", 
  ellipse = TRUE, 
  ellipse.type = "convex", 
  shape = "assignment_group", # shape by assignment group
  size = 1.5,  
  legend = "right", 
  ggtheme = theme_bw(),
  xlab = paste0("PC1 (", SKCM_var_perc[1], "% )" ),
  ylab = paste0("PC2 (", SKCM_var_perc[2], "% )" )
) +
  stat_mean(aes(color = cluster), size = 4)

Like discussed before, it seems to form cluster centres, based on the density/spareness of the data points, where cluster 1, has a more dense distribution than cluster 2. This density property does not separate the biological groups.


2. Proportion of group 1 and group 3 samples clustered together

group_1_ids <- SKCM_cancer_stage_grouped %>%
  filter(assignment_group == 'Group 1') %>%
  rownames()

group_3_ids <- SKCM_cancer_stage_grouped %>%
  filter(assignment_group == 'Group 3') %>%
  rownames()

sum(dge_clusters[group_1_ids] == 1) #Number of samples from group 1 clustered into cluster 1
## [1] 34
sum(dge_clusters[group_3_ids] == 1) #Number of samples from group 3 clustered into cluster 1
## [1] 1
dge_pts[1] # Total group 1samples 
## [1] 35
sum(dge_clusters[group_1_ids] == 2) #Number of samples from group 1 clustered into cluster 2
## [1] 158
sum(dge_clusters[group_3_ids] == 2) #Number of samples from group 3 clustered into cluster 2
## [1] 20
dge_pts[2] # Total samples in cluster 2
## [1] 178

From the k-means clustering of k=2:

  • 34 group 1 samples clustered in cluster 1 (97.1%)

  • 1 group 3 sample clustered in cluster 1 (2.9%)

  • 158 group 1 samples clustered in cluster 2 (88.8%)

  • 20 group 3 samples clustered in cluster 2 (11.2%)

This meant that:

  • 35/213 = 16.4% of all samples clustered into cluster 1

  • 178/213 = 83.6% of all samples clustered into cluster 2

It is unclear if K-means clustering here clustered based on cancer stage grouping, but it is notable that a larger proportion of group 3 samples were clustered/represented in cluster 2 (11.2%), compared to cluster 1 (2.9%).


3. Hierarchical clustering

# using squared eucledian distance, and Ward's agglomeration method

# This matrix, rows are samples
sq_euc_dist_SKCM_DGE_1_3_scaled <- dist(x=SKCM_DGE_1_3_scaled, method="euclidean")**2

# Perform hierarchical clustering on the samples, based on squared euc distance, using 
# Ward's agglomeration method. Agglomerative = bottom-up
hier_clust_SKCM_1_3 <- hclust(sq_euc_dist_SKCM_DGE_1_3_scaled, method="ward.D")
# Plot the clustering, removing labels for clarity.
plot(hier_clust_SKCM_1_3, labels=FALSE)

The dendrogram above indicates some form of structure or hierarchy in the data. Given that we are interested in two groups; group 1 and group 3, this may perhaps be present when cutting the dendrogram into two groups.


4. First split in the hierarchy

# Plot the dendrogram
plot(hier_clust_SKCM_1_3, labels=FALSE)
rect.hclust(hier_clust_SKCM_1_3, k = 2, border = "red")

Then cutting the tree:

hier_clust_SKCM_1_3_cut <- cutree(hier_clust_SKCM_1_3, k = 2)
sum(hier_clust_SKCM_1_3_cut[group_1_ids] == 1)
## [1] 42
sum(hier_clust_SKCM_1_3_cut[group_3_ids] == 1)
## [1] 3
sum(hier_clust_SKCM_1_3_cut[group_1_ids] == 2)
## [1] 150
sum(hier_clust_SKCM_1_3_cut[group_3_ids] == 2)
## [1] 18
table(hier_clust_SKCM_1_3_cut)
## hier_clust_SKCM_1_3_cut
##   1   2 
##  45 168
# For rough visualisation of this,
# Label node leaves (terminal leaves) by colour. BLACK = Group 1, GREEN = Group 3
dend_SKCM <- as.dendrogram(hier_clust_SKCM_1_3)
gr1_patients <- rownames(
  SKCM_cancer_stage_grouped
  )[SKCM_cancer_stage_grouped$assignment_group == 'Group 1']
col_gr1_red <- ifelse(labels(dend_SKCM) %in% gr1_patients, "black", "green")
labelled_dend_SKCM <- assign_values_to_leaves_edgePar(dend=dend_SKCM, value=col_gr1_red, edgePar = "col")
labels(labelled_dend_SKCM) <- NULL #turn off labels
## Warning in `labels<-.dendrogram`(`*tmp*`, value = NULL): The lengths of the
## new labels is shorter than the number of leaves in the dendrogram - labels are
## recycled.
## Warning in rep(new_labels, length.out = leaves_length): 'x' is NULL so the
## result will be NULL
plot(labelled_dend_SKCM, main = "Cluster Dendrogram labelled by cancer group")

From this hierarchical clustering:

  • 42 group 1 samples clustered in cluster 1 (93.3%)

  • 3 group 3 samples clustered in cluster 1 (6.7%)

  • 150 group 1 samples clustered in cluster 2 (89.3%)

  • 18 group 3 samples clustered in cluster 2 (10.7%)

This meant that:

  • 45/213 = 21.1% of all samples clustered into cluster 1

  • 168/213 = 78.9% of all samples clustered into cluster 2


5. Better approach for clustering

This data set has very unbalanced classes, where there are only 21 group 3 samples, and 192 group 1 samples. This means that k-means would be less suited for clustering samples here as iterative calculation of the two centroids would be skewed towards group 1 samples (as random starts would more likely fall in the proximity of group 1 samples, and centroid means would be skewed towards these samples, due to their greater abundance).

Hierarchical clustering, especially with an agglomerative method (such as Ward’s agglomerative method used here) would be more appropriate in this scenario, as the method treats each sample as a node/cluster, and combines clusters ‘bottom-up’, rather than being based off skewed centroids as discussed.


Part 5: Correlation of methylation and gene expression

1. Gene of interest: MGMT

# subset to only include entries for MGMT row
SKCM_MGMT <- SKCM_filtered_norm_lcpm['MGMT|ENSG00000170430',] 

# Get the patients meta-data. Add MGMT logcpm as a column
SKCM_patients_MGMT <- SKCM_cancer_stage_grouped %>%
  mutate(MGMT_norm_lcpm = SKCM_filtered_norm_lcpm['MGMT|ENSG00000170430',])

# Plot boxplot of all 3 groups, according to logCPM expression of MGMT
SKCM_patients_MGMT %>%
  ggplot(aes(x = assignment_group, y = MGMT_norm_lcpm, fill = assignment_group)) +
   labs(
     x = "Cancer stage group", 
     y = "logCPM", 
   ) +
  geom_boxplot()

Not in the list of top 10 (nor top 50) most significantly expressed genes.


2 and 3. Correlation between MGMT expression and methylation data

methylation_data <- read.csv('Methylation_TCGA.csv')

# Filter for SKCM cancer type, and relevant gene MGMT. Filter out na values for beta_val
SKCM_methylation_data <- methylation_data %>%
  filter(Cancer_type == 'SKCM') %>%
  filter(Gene == 'MGMT') %>%
  filter(!is.na(beta_val))

# Should only have SKCM and MGMT
unique(SKCM_methylation_data$Cancer_type)
## [1] "SKCM"
unique(SKCM_methylation_data$Gene)
## [1] "MGMT"
unique(SKCM_methylation_data$probe)
## [1] "cg14194875" "cg00618725" "cg00198994" "cg13171643" "cg26976729"
## [6] "cg11019008" "cg00149734" "cg07367735"
# 8 probes
methyl_SKCM_patients_MGMT <- merge(SKCM_patients_MGMT, SKCM_methylation_data, by='patient')
# multiple patient/sample entries now, due to multiple probes. Merge by common column, patient 

methyl_SKCM_patients_MGMT %>%
  ggplot(aes(x=beta_val, y=MGMT_norm_lcpm)) + #Gene expression will be dependent on beta_val 
  geom_point(size=0.1) + # reduce size for visual clarity
  geom_smooth(method = "lm") +
  stat_cor(method = "pearson", ) +
  facet_wrap(~probe) #produce a plot for every probe
## `geom_smooth()` using formula 'y ~ x'

All correlations are significant (p < 0.05).

Correlation coefficients summarised below:

Probe Coefficient (R)
cg00149734 0.65
cg00198994 0.77
cg00618725 -0.37
cg07367735 0.71
cg11019008 0.63
cg13171643 0.75
cg14194875 -0.45
cg26976729 0.75

Majority of probes (6 out of 8) have quite moderate to strong positive correlations. The minority only have quite weak to moderate negative correlations.

While using probes which positively correlate methylation with gene expression may seem counter-intuitive (since it is well known that methylation is a marker for gene silencing), it may actually be biologically sound. Two of the probes here (cg00198994 and cg07367735) have been previously observed by Bacolod and Barany (2021) to have positive correlations with MGMT gene expression. These probes correspond to intronic regions within the actual gene body of MGMT. (MGMT paper, 33535955) Furthermore, as observed here, these probes have larger correlation magnitudes, than probes corresponding to promoter regions with only weak to moderate negative correlations. Bacolod and Barany (2021) suggests that this property makes intronic probes a better predictor for MGMT expression, and therefore suitable variables for the multiple regression model.


4. Multiple regression using the top correlated probes

# Pivot so that each row is one patient/sample, and the each probe has its own column
# and the value in probe represents its corresponding beta_val
widen_methyl_SKCM_patients_MGMT <- methyl_SKCM_patients_MGMT %>%
  pivot_wider(names_from=probe, values_from=beta_val)

# the top correlated probes (with the absolute value of the correlation coefficient > 0.5). 
# these would just be all the positive correlations
# Remove the negative correlations, which are also < |0.5|. 
# Do not include cg14194875', 'cg00618725'
multiple_reg_MGMT_SKCM <- lm(
  MGMT_norm_lcpm ~ cg00149734 + cg00198994 + cg07367735 + cg11019008 + cg13171643 + cg26976729,
  data=widen_methyl_SKCM_patients_MGMT
  )

summary(multiple_reg_MGMT_SKCM)
## 
## Call:
## lm(formula = MGMT_norm_lcpm ~ cg00149734 + cg00198994 + cg07367735 + 
##     cg11019008 + cg13171643 + cg26976729, data = widen_methyl_SKCM_patients_MGMT)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5317 -0.5987 -0.0226  0.4897  2.8801 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.33597    0.27213  -1.235 0.217769    
## cg00149734  -0.05230    0.50165  -0.104 0.917030    
## cg00198994   2.27333    0.55382   4.105 4.99e-05 ***
## cg07367735   0.14101    0.43770   0.322 0.747508    
## cg11019008   0.05752    0.30339   0.190 0.849736    
## cg13171643   2.00785    0.57931   3.466 0.000591 ***
## cg26976729   1.31966    0.56793   2.324 0.020688 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9556 on 369 degrees of freedom
## Multiple R-squared:  0.6319, Adjusted R-squared:  0.6259 
## F-statistic: 105.6 on 6 and 369 DF,  p-value: < 2.2e-16

Coefficients:

Probe/Variable Coefficients (b)
intercept -0.33597
cg00149734 -0.05230
cg00198994 2.27333
cg07367735 0.14101
cg11019008 0.05752
cg13171643 2.00785
cg26976729 1.31966

Adjusted R-squared = 0.6259

p-value = 2.2e-16


Additional References

Bacolod, M. D., & Barany, F. (2021). MGMT Epigenetics: The Influence of Gene Body Methylation and Other Insights Derived from Integrated Methylomic, Transcriptomic, and Chromatin Analyses in Various Cancer Types. Current cancer drug targets, 21(4), 360–374. https://doi.org/10.2174/1568009621666210203111620